library(rio)
## Warning: package 'rio' was built under R version 4.4.3
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(viridis)
## Loading required package: viridisLite
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
## 
##     discard
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:rio':
## 
##     export
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.4.3
library(clustertend)
## Package `clustertend` is deprecated.  Use package `hopkins` instead.
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# install.packages("ROSE")
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4

Modelos con Algoritmos

Determinamos que la variable respuesta para nuestra investigación sería el ratio entre la edad de la víctima y la edad del agresor, que describimos con anterioridad en el análisis exploratorio. Este ratio se define como:

\[ q=\frac{edad\ victima}{edad\ agresor} \]

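Este nuevo indicador “q” lo dividimos en 3 categorías, según qué tanta diferencia de edad se encontró entre víctimas y agresores. Las categorías son las siguientes:

- Mucho menor: q ≤ 0.8 (la víctima es considerablemente menor que el agresor).
- Similar: 0.8 < q ≤ 1.2 (la víctima y el agresor tienen edades parecidas).
- Mucho mayor: q > 1.2 (la víctima es considerablemente mayor que el agresor).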

División de los datos

Primero, tenemos que unir en uno solo los datasets de todos los años (2023 y los años anteriores, de 2013 a 2022):

# Base data frame: the 2023 file is read first; the other years are appended to it.
original <- import(file = "Datos/2023.sav")

# Append one yearly file to the global `original` data frame.
#
# year: value inserted into the file name, read from "Datos/<year>.sav".
# Side effect: rebinds the global `original` to the previous rows followed
# by the rows read from that file.
addToDataset <- function(year) {
  yearly_path <- paste0("Datos/", year, ".sav")
  original <<- bind_rows(original, import(yearly_path))
}

# Replace sentinel codes with NA in one column of the global `original`.
#
# column: name of the column to clean (single string).
# values: vector of codes that mean "ignored / unknown" for that column.
#
# Side effect: overwrites `original[[column]]` in the enclosing environment.
# Stops with an error when `column` is not present in `original`.
ifValueConvertToNA <- function(column, values) {
  if (!column %in% names(original)) {
    stop("Column not found in `original`: ", column, call. = FALSE)
  }
  # `[[ ]]` always yields the column vector, for data frames and tibbles alike.
  cleaned <- original[[column]]
  # Index replacement keeps the column's type and attributes, even when
  # every value turns out to be a sentinel.
  cleaned[cleaned %in% values] <- NA
  original[[column]] <<- cleaned
}

# Append the 2013 through 2022 files, in that order, to `original`.
walk(2013:2022, addToDataset)

# Sentinel codes that stand for "ignored / unknown" in the source files.
ignoredValues <- c(9, 99, 999, 9999)

# Columns whose sentinel codes are replaced with NA.
affectedColumns <- c(
  "VIC_EDAD",
  "TOTAL_HIJOS",
  "NUM_HIJ_HOM",
  "NUM_HIJ_MUJ",
  "VIC_ALFAB",
  "VIC_ESCOLARIDAD",
  "VIC_EST_CIV",
  "VIC_GRUPET",
  "VIC_NACIONAL",
  "VIC_TRABAJA",
  "VIC_OCUP",
  "VIC_DEDICA",
  "VIC_DISC",
  "TIPO_DISCAQ",
  "OTRAS_VICTIMAS",
  "VIC_OTRAS_HOM",
  "VIC_OTRAS_MUJ",
  "VIC_OTRAS_N_OS",
  "VIC_OTRAS_N_AS",
  "HEC_DIA",
  "HEC_MES",
  "HEC_ANO",
  "HEC_DEPTO",
  "HEC_DEPTOMCPIO",
  "HEC_AREA",
  "HEC_RECUR_DENUN",
  "INST_DONDE_DENUNCIO",
  "AGR_EDAD",
  "AGR_ALFAB",
  "AGR_ESCOLARIDAD",
  "AGR_EST_CIV",
  "AGR_GURPET",
  "AGR_NACIONAL",
  "AGR_TRABAJA",
  "AGR_OCUP",
  "AGR_DEDICA",
  "AGRESORES_OTROS_TOTAL",
  "AGR_OTROS_HOM",
  "AGR_OTRAS_MUJ",
  "AGR_OTROS_N_OS",
  "AGR_OTRAS_N_AS",
  "CONDUCENTE",
  "LEY_APLICABLE",
  "ARTICULOVIF1",
  "ARTICULOVIF2",
  "ARTICULOVIF3",
  "ARTICULOVIF4",
  "ARTICULOVCM1",
  "ARTICULOVCM2",
  "ARTICULOVCM3",
  "ARTICULOVCM4",
  "ARTICULOCODPEN1",
  "ARTICULOCODPEN2",
  "ARTICULOCODPEN3",
  "ARTICULOCODPEN4",
  "ARTICULOTRAS1",
  "ARTICULOTRAS2",
  "ARTICULOTRAS3",
  "ARTICULOTRAS4",
  "MEDIDAS_SEGURIDAD",
  "ORGANISMO_REMITE",
  "QUIEN_REPORTA",
  "ORGANISMO_JURISDICCIONAL"
)

# Columns in which 9 is a real observation, not a sentinel: day 9 of the
# month, September, department code 9, and ages of 9 years. Treating 9 as
# "ignored" here would silently discard those records.
# NOTE(review): other columns whose values go above 9 (child and victim
# counts, ORGANISMO_*, ARTICULO*) may have the same problem; confirm against
# the codebook before adding them here.
nineIsValidColumns <- c(
  "HEC_DIA",
  "HEC_MES",
  "HEC_DEPTO",
  "VIC_EDAD",
  "AGR_EDAD"
)

for (col in affectedColumns) {
  # Drop 9 from the sentinel set only where 9 is a legitimate value.
  sentinels <- if (col %in% nineIsValidColumns) {
    ignoredValues[ignoredValues != 9]
  } else {
    ignoredValues
  }
  ifValueConvertToNA(col, sentinels)
}

# TIPO_MEDIDA is cleaned as well; its "ignored" code is the value "z".
ifValueConvertToNA(column = "TIPO_MEDIDA", values = "z")

# For some reason this column gets created; all of its values are NaN,
# so it is dropped.
original[["filter_$"]] <- NULL

summary(original)
##     HEC_DIA         HEC_MES         HEC_ANO       HEC_DEPTO     
##  Min.   : 1.00   Min.   : 1.00   Min.   :2000   Min.   : 1.0    
##  1st Qu.: 7.00   1st Qu.: 3.00   1st Qu.:2015   1st Qu.: 1.0    
##  Median :15.00   Median : 6.00   Median :2018   Median :10.0    
##  Mean   :15.33   Mean   : 6.19   Mean   :2018   Mean   : 8.9    
##  3rd Qu.:23.00   3rd Qu.:10.00   3rd Qu.:2021   3rd Qu.:16.0    
##  Max.   :31.00   Max.   :12.00   Max.   :2023   Max.   :22.0    
##  NA's   :16084   NA's   :33585   NA's   :4170   NA's   :330334  
##  HEC_DEPTOMCPIO    HEC_TIPAGRE   NUMERO_BOLETA     DIA_EMISION   
##  Min.   : 101.0   Min.   :1111   Min.   :    0    Min.   : 1.00  
##  1st Qu.: 311.0   1st Qu.:1122   1st Qu.:   40    1st Qu.: 8.00  
##  Median :1003.0   Median :1222   Median :   95    Median :15.00  
##  Mean   : 961.3   Mean   :1603   Mean   : 1057    Mean   :15.32  
##  3rd Qu.:1601.0   3rd Qu.:2122   3rd Qu.:  363    3rd Qu.:23.00  
##  Max.   :2217.0   Max.   :2221   Max.   :17020    Max.   :31.00  
##  NA's   :1859                    NA's   :254152                  
##   MES_EMISION      ANO_EMISION       DEPTO         DEPTO_MCPIO    
##  Min.   : 1.000   Min.   :2013   Min.   : 1.0     Min.   : 101.0  
##  1st Qu.: 4.000   1st Qu.:2015   1st Qu.: 1.0     1st Qu.: 309.0  
##  Median : 6.000   Median :2018   Median : 9.0     Median :1003.0  
##  Mean   : 6.421   Mean   :2018   Mean   : 8.7     Mean   : 958.3  
##  3rd Qu.: 9.000   3rd Qu.:2021   3rd Qu.:15.0     3rd Qu.:1601.0  
##  Max.   :12.000   Max.   :2023   Max.   :22.0     Max.   :2217.0  
##                                  NA's   :327781                   
##  QUIEN_REPORTA      VIC_SEXO        VIC_EDAD      TOTAL_HIJOS   
##  Min.   :1.000   Min.   :1.000   Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:24.00   1st Qu.: 1.00  
##  Median :1.000   Median :2.000   Median :31.00   Median : 2.00  
##  Mean   :1.031   Mean   :1.878   Mean   :33.63   Mean   : 2.08  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:40.00   3rd Qu.: 3.00  
##  Max.   :3.000   Max.   :2.000   Max.   :98.00   Max.   :19.00  
##  NA's   :4362                    NA's   :5635    NA's   :75236  
##   NUM_HIJ_HOM     NUM_HIJ_MUJ      VIC_ALFAB     VIC_ESCOLARIDAD
##  Min.   : 0.00   Min.   : 0.00   Min.   :1.000   Min.   :10.0   
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.:1.000   1st Qu.:23.0   
##  Median : 1.00   Median : 1.00   Median :1.000   Median :29.0   
##  Mean   : 1.08   Mean   : 1.01   Mean   :1.163   Mean   :29.7   
##  3rd Qu.: 2.00   3rd Qu.: 2.00   3rd Qu.:1.000   3rd Qu.:39.0   
##  Max.   :14.00   Max.   :14.00   Max.   :2.000   Max.   :59.0   
##  NA's   :74409   NA's   :74364   NA's   :3326    NA's   :12268  
##   VIC_EST_CIV      VIC_GRUPET     VIC_NACIONAL    VIC_TRABAJA  
##  Min.   :1.00    Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:2.00    1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :2.00    Median :1.000   Median :1.000   Median :2.00  
##  Mean   :2.28    Mean   :1.921   Mean   :1.005   Mean   :1.66  
##  3rd Qu.:3.00    3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.00  
##  Max.   :5.00    Max.   :6.000   Max.   :2.000   Max.   :2.00  
##  NA's   :71927   NA's   :5478    NA's   :2488    NA's   :2645  
##     VIC_OCUP        VIC_DEDICA        VIC_DISC      TIPO_DISCAQ    
##  Min.   : 110     Min.   :1.0      Min.   :1.000   Min.   :1.0     
##  1st Qu.:5142     1st Qu.:1.0      1st Qu.:2.000   1st Qu.:2.0     
##  Median :5311     Median :1.0      Median :2.000   Median :3.0     
##  Mean   :6258     Mean   :1.1      Mean   :1.992   Mean   :3.3     
##  3rd Qu.:9111     3rd Qu.:1.0      3rd Qu.:2.000   3rd Qu.:6.0     
##  Max.   :9998     Max.   :6.0      Max.   :2.000   Max.   :6.0     
##  NA's   :245216   NA's   :129152   NA's   :16136   NA's   :363397  
##   VIC_REL_AGR     OTRAS_VICTIMAS   VIC_OTRAS_HOM    VIC_OTRAS_MUJ   
##  Min.   : 1.000   Min.   : 0.00    Min.   :0.00     Min.   : 0.00   
##  1st Qu.: 1.000   1st Qu.: 0.00    1st Qu.:0.00     1st Qu.: 0.00   
##  Median : 2.000   Median : 0.00    Median :0.00     Median : 0.00   
##  Mean   : 3.446   Mean   : 0.85    Mean   :0.09     Mean   : 0.15   
##  3rd Qu.: 4.000   3rd Qu.: 1.00    3rd Qu.:0.00     3rd Qu.: 0.00   
##  Max.   :10.000   Max.   :19.00    Max.   :8.00     Max.   :14.00   
##                   NA's   :144298   NA's   :144107   NA's   :144110  
##  VIC_OTRAS_N_OS   VIC_OTRAS_N_AS      HEC_AREA     HEC_RECUR_DENUN
##  Min.   : 0.00    Min.   :0.0      Min.   :1.000   Min.   :1.000  
##  1st Qu.: 0.00    1st Qu.:0.0      1st Qu.:1.000   1st Qu.:2.000  
##  Median : 0.00    Median :0.0      Median :1.000   Median :2.000  
##  Mean   : 0.32    Mean   :0.3      Mean   :1.426   Mean   :1.884  
##  3rd Qu.: 0.00    3rd Qu.:0.0      3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :11.00    Max.   :8.0      Max.   :2.000   Max.   :2.000  
##  NA's   :144109   NA's   :144106   NA's   :12853   NA's   :13702  
##  INST_DONDE_DENUNCIO    AGR_SEXO        AGR_EDAD       AGR_ALFAB    
##  Min.   :1.0         Min.   :1.000   Min.   : 7.00   Min.   :1.000  
##  1st Qu.:3.0         1st Qu.:1.000   1st Qu.:26.00   1st Qu.:1.000  
##  Median :4.0         Median :1.000   Median :33.00   Median :1.000  
##  Mean   :3.1         Mean   :1.152   Mean   :34.56   Mean   :1.121  
##  3rd Qu.:4.0         3rd Qu.:1.000   3rd Qu.:40.00   3rd Qu.:1.000  
##  Max.   :6.0         Max.   :2.000   Max.   :98.00   Max.   :2.000  
##  NA's   :329334                      NA's   :24014   NA's   :9183   
##  AGR_ESCOLARIDAD  AGR_EST_CIV      AGR_GURPET     AGR_NACIONAL  
##  Min.   :10.00   Min.   :1.00    Min.   :1.000   Min.   :1.000  
##  1st Qu.:24.00   1st Qu.:2.00    1st Qu.:1.000   1st Qu.:1.000  
##  Median :29.00   Median :2.00    Median :1.000   Median :1.000  
##  Mean   :30.26   Mean   :2.28    Mean   :1.941   Mean   :1.004  
##  3rd Qu.:39.00   3rd Qu.:3.00    3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :59.00   Max.   :5.00    Max.   :6.000   Max.   :2.000  
##  NA's   :21433   NA's   :72192   NA's   :6834    NA's   :10559  
##   AGR_TRABAJA       AGR_OCUP        AGR_DEDICA     AGRESORES_OTROS_TOTAL
##  Min.   :1.000   Min.   : 110     Min.   :1.00     Min.   : 0.00        
##  1st Qu.:1.000   1st Qu.:5414     1st Qu.:1.00     1st Qu.: 0.00        
##  Median :1.000   Median :6111     Median :1.00     Median : 0.00        
##  Mean   :1.207   Mean   :6890     Mean   :1.82     Mean   : 0.21        
##  3rd Qu.:1.000   3rd Qu.:9111     3rd Qu.:3.00     3rd Qu.: 0.00        
##  Max.   :2.000   Max.   :9998     Max.   :6.00     Max.   :15.00        
##  NA's   :14964   NA's   :100602   NA's   :304722   NA's   :187468       
##  AGR_OTROS_HOM    AGR_OTRAS_MUJ    AGR_OTROS_N_OS   AGR_OTRAS_N_AS  
##  Min.   :0.00     Min.   :0.0      Min.   :0.00     Min.   :0.00    
##  1st Qu.:0.00     1st Qu.:0.0      1st Qu.:0.00     1st Qu.:0.00    
##  Median :0.00     Median :0.0      Median :0.00     Median :0.00    
##  Mean   :0.07     Mean   :0.1      Mean   :0.02     Mean   :0.01    
##  3rd Qu.:0.00     3rd Qu.:0.0      3rd Qu.:0.00     3rd Qu.:0.00    
##  Max.   :8.00     Max.   :8.0      Max.   :7.00     Max.   :6.00    
##  NA's   :187462   NA's   :187462   NA's   :187461   NA's   :187461  
##  INST_DENUN_HECHO ORGANISMO_JURISDICCIONAL   CONDUCENTE     LEY_APLICABLE   
##  Min.   :1.000    Min.   : 1.00            Min.   :1.00     Min.   :1.00    
##  1st Qu.:3.000    1st Qu.: 1.00            1st Qu.:1.00     1st Qu.:1.00    
##  Median :4.000    Median : 1.00            Median :1.00     Median :1.00    
##  Mean   :3.443    Mean   : 4.73            Mean   :1.38     Mean   :1.74    
##  3rd Qu.:4.000    3rd Qu.: 7.00            3rd Qu.:2.00     3rd Qu.:3.00    
##  Max.   :6.000    Max.   :16.00            Max.   :2.00     Max.   :6.00    
##                   NA's   :240778           NA's   :249954   NA's   :170923  
##   ARTICULOVIF1     ARTICULOVIF2     ARTICULOVIF3     ARTICULOVIF4   
##  Min.   : 1.00    Min.   : 0.00    Min.   : 0.00    Min.   : 0.00   
##  1st Qu.: 7.00    1st Qu.: 0.00    1st Qu.: 0.00    1st Qu.: 0.00   
##  Median : 7.00    Median : 0.00    Median : 0.00    Median : 0.00   
##  Mean   : 6.71    Mean   : 0.35    Mean   : 0.24    Mean   : 0.26   
##  3rd Qu.: 7.00    3rd Qu.: 0.00    3rd Qu.: 0.00    3rd Qu.: 0.00   
##  Max.   :10.00    Max.   :15.00    Max.   :17.00    Max.   :16.00   
##  NA's   :244073   NA's   :244881   NA's   :244922   NA's   :244164  
##   ARTICULOVCM1     ARTICULOVCM2     ARTICULOVCM3     ARTICULOVCM4   
##  Min.   : 0.0     Min.   : 0.0     Min.   : 0.0     Min.   : 0.0    
##  1st Qu.: 7.0     1st Qu.: 0.0     1st Qu.: 0.0     1st Qu.: 0.0    
##  Median : 7.0     Median : 0.0     Median : 0.0     Median : 0.0    
##  Mean   : 6.6     Mean   : 0.6     Mean   : 0.2     Mean   : 0.3    
##  3rd Qu.: 7.0     3rd Qu.: 0.0     3rd Qu.: 0.0     3rd Qu.: 0.0    
##  Max.   :25.0     Max.   :25.0     Max.   :25.0     Max.   :25.0    
##  NA's   :328046   NA's   :325569   NA's   :324914   NA's   :324774  
##  ARTICULOCODPEN1  ARTICULOCODPEN2  ARTICULOCODPEN3  ARTICULOCODPEN4 
##  Min.   :  1.0    Min.   :  0      Min.   :  0.0    Min.   :  0.0   
##  1st Qu.:203.0    1st Qu.:  0      1st Qu.:  0.0    1st Qu.:  0.0   
##  Median :215.0    Median :  0      Median :  0.0    Median :  0.0   
##  Mean   :312.2    Mean   : 29      Mean   :  1.4    Mean   :  0.5   
##  3rd Qu.:482.0    3rd Qu.:  0      3rd Qu.:  0.0    3rd Qu.:  0.0   
##  Max.   :495.0    Max.   :494      Max.   :257.0    Max.   :205.0   
##  NA's   :364280   NA's   :364280   NA's   :364280   NA's   :364280  
##  ARTICULOTRAS1    ARTICULOTRAS2    ARTICULOTRAS3    ARTICULOTRAS4   
##  Min.   :141.0    Min.   :  0.0    Min.   :0        Min.   :0       
##  1st Qu.:141.0    1st Qu.:142.0    1st Qu.:0        1st Qu.:0       
##  Median :141.0    Median :142.0    Median :0        Median :0       
##  Mean   :147.4    Mean   :113.6    Mean   :0        Mean   :0       
##  3rd Qu.:141.0    3rd Qu.:142.0    3rd Qu.:0        3rd Qu.:0       
##  Max.   :173.0    Max.   :142.0    Max.   :0        Max.   :0       
##  NA's   :365124   NA's   :365124   NA's   :365124   NA's   :365124  
##  MEDIDAS_SEGURIDAD TIPO_MEDIDA        ORGANISMO_REMITE
##  Min.   :1         Length:365129      Min.   : 1.00   
##  1st Qu.:1         Class :character   1st Qu.:17.00   
##  Median :1         Mode  :character   Median :17.00   
##  Mean   :1                            Mean   :15.71   
##  3rd Qu.:1                            3rd Qu.:18.00   
##  Max.   :2                            Max.   :19.00   
##  NA's   :171957                       NA's   :277781

Ahora creamos la variable respuesta:

# Build the response variable: keep rows where both ages are known, compute
# the victim/aggressor age ratio and bin it into three categories.
edad_agr_vic <- original %>%
  filter(!is.na(VIC_EDAD), !is.na(AGR_EDAD)) %>%
  mutate(
    vicRatioAgr = VIC_EDAD / AGR_EDAD,
    diferenciaEdad = case_when(
      # An undefined ratio stays NA instead of falling into the last bin.
      is.na(vicRatioAgr) ~ NA_character_,
      vicRatioAgr <= 0.8 ~ "Mucho menor",
      vicRatioAgr <= 1.2 ~ "Similar",
      TRUE ~ "Mucho mayor"
    )
  )

summary(edad_agr_vic)
##     HEC_DIA         HEC_MES          HEC_ANO       HEC_DEPTO     
##  Min.   : 1.00   Min.   : 1.000   Min.   :2000   Min.   : 1.00   
##  1st Qu.: 7.00   1st Qu.: 3.000   1st Qu.:2015   1st Qu.: 2.00   
##  Median :15.00   Median : 6.000   Median :2018   Median :10.00   
##  Mean   :15.31   Mean   : 6.182   Mean   :2018   Mean   : 9.11   
##  3rd Qu.:23.00   3rd Qu.:10.000   3rd Qu.:2021   3rd Qu.:16.00   
##  Max.   :31.00   Max.   :12.000   Max.   :2023   Max.   :22.00   
##  NA's   :14243   NA's   :30375    NA's   :3242   NA's   :306113  
##  HEC_DEPTOMCPIO    HEC_TIPAGRE   NUMERO_BOLETA     DIA_EMISION   
##  Min.   : 101.0   Min.   :1111   Min.   :    0    Min.   : 1.00  
##  1st Qu.: 312.0   1st Qu.:1122   1st Qu.:   39    1st Qu.: 8.00  
##  Median :1004.0   Median :1222   Median :   93    Median :15.00  
##  Mean   : 963.8   Mean   :1600   Mean   : 1122    Mean   :15.31  
##  3rd Qu.:1601.0   3rd Qu.:2122   3rd Qu.:  357    3rd Qu.:23.00  
##  Max.   :2217.0   Max.   :2221   Max.   :17020    Max.   :31.00  
##  NA's   :1241                    NA's   :236858                  
##   MES_EMISION      ANO_EMISION       DEPTO         DEPTO_MCPIO    
##  Min.   : 1.000   Min.   :2013   Min.   : 1.00    Min.   : 101.0  
##  1st Qu.: 4.000   1st Qu.:2015   1st Qu.: 2.00    1st Qu.: 312.0  
##  Median : 6.000   Median :2018   Median : 9.00    Median :1004.0  
##  Mean   : 6.414   Mean   :2018   Mean   : 8.99    Mean   : 961.8  
##  3rd Qu.: 9.000   3rd Qu.:2021   3rd Qu.:15.00    3rd Qu.:1601.0  
##  Max.   :12.000   Max.   :2023   Max.   :22.00    Max.   :2217.0  
##                                  NA's   :303902                   
##  QUIEN_REPORTA      VIC_SEXO        VIC_EDAD      TOTAL_HIJOS   
##  Min.   :1.000   Min.   :1.000   Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:24.00   1st Qu.: 1.00  
##  Median :1.000   Median :2.000   Median :31.00   Median : 2.00  
##  Mean   :1.029   Mean   :1.878   Mean   :33.56   Mean   : 2.09  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:40.00   3rd Qu.: 3.00  
##  Max.   :3.000   Max.   :2.000   Max.   :98.00   Max.   :19.00  
##  NA's   :3679                                    NA's   :62215  
##   NUM_HIJ_HOM     NUM_HIJ_MUJ      VIC_ALFAB     VIC_ESCOLARIDAD
##  Min.   : 0.00   Min.   : 0.00   Min.   :1.000   Min.   :10.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.:1.000   1st Qu.:23.00  
##  Median : 1.00   Median : 1.00   Median :1.000   Median :29.00  
##  Mean   : 1.09   Mean   : 1.02   Mean   :1.162   Mean   :29.72  
##  3rd Qu.: 2.00   3rd Qu.: 2.00   3rd Qu.:1.000   3rd Qu.:39.00  
##  Max.   :14.00   Max.   :14.00   Max.   :2.000   Max.   :59.00  
##  NA's   :61418   NA's   :61381   NA's   :2068    NA's   :8208   
##   VIC_EST_CIV      VIC_GRUPET     VIC_NACIONAL    VIC_TRABAJA   
##  Min.   :1.00    Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.00    1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.00    Median :1.000   Median :1.000   Median :2.000  
##  Mean   :2.29    Mean   :1.901   Mean   :1.005   Mean   :1.657  
##  3rd Qu.:3.00    3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :5.00    Max.   :6.000   Max.   :2.000   Max.   :2.000  
##  NA's   :65122   NA's   :4350    NA's   :1965    NA's   :1815   
##     VIC_OCUP        VIC_DEDICA        VIC_DISC      TIPO_DISCAQ    
##  Min.   : 110     Min.   :1.0      Min.   :1.000   Min.   :1.0     
##  1st Qu.:5142     1st Qu.:1.0      1st Qu.:2.000   1st Qu.:2.0     
##  Median :5249     Median :1.0      Median :2.000   Median :3.0     
##  Mean   :6255     Mean   :1.1      Mean   :1.992   Mean   :3.3     
##  3rd Qu.:9111     3rd Qu.:1.0      3rd Qu.:2.000   3rd Qu.:5.0     
##  Max.   :9998     Max.   :6.0      Max.   :2.000   Max.   :6.0     
##  NA's   :225247   NA's   :119518   NA's   :13392   NA's   :335976  
##   VIC_REL_AGR     OTRAS_VICTIMAS   VIC_OTRAS_HOM    VIC_OTRAS_MUJ   
##  Min.   : 1.000   Min.   : 0.00    Min.   :0.00     Min.   : 0.00   
##  1st Qu.: 1.000   1st Qu.: 0.00    1st Qu.:0.00     1st Qu.: 0.00   
##  Median : 2.000   Median : 0.00    Median :0.00     Median : 0.00   
##  Mean   : 3.399   Mean   : 0.85    Mean   :0.09     Mean   : 0.15   
##  3rd Qu.: 3.000   3rd Qu.: 1.00    3rd Qu.:0.00     3rd Qu.: 0.00   
##  Max.   :10.000   Max.   :19.00    Max.   :8.00     Max.   :14.00   
##                   NA's   :127533   NA's   :127348   NA's   :127351  
##  VIC_OTRAS_N_OS   VIC_OTRAS_N_AS      HEC_AREA     HEC_RECUR_DENUN
##  Min.   : 0.00    Min.   :0.0      Min.   :1.000   Min.   :1.000  
##  1st Qu.: 0.00    1st Qu.:0.0      1st Qu.:1.000   1st Qu.:2.000  
##  Median : 0.00    Median :0.0      Median :1.000   Median :2.000  
##  Mean   : 0.32    Mean   :0.3      Mean   :1.428   Mean   :1.884  
##  3rd Qu.: 0.00    3rd Qu.:0.0      3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :11.00    Max.   :8.0      Max.   :2.000   Max.   :2.000  
##  NA's   :127350   NA's   :127347   NA's   :10560   NA's   :11185  
##  INST_DONDE_DENUNCIO    AGR_SEXO        AGR_EDAD       AGR_ALFAB    
##  Min.   :1.00        Min.   :1.000   Min.   : 7.00   Min.   :1.000  
##  1st Qu.:3.00        1st Qu.:1.000   1st Qu.:26.00   1st Qu.:1.000  
##  Median :4.00        Median :1.000   Median :33.00   Median :1.000  
##  Mean   :3.11        Mean   :1.151   Mean   :34.55   Mean   :1.118  
##  3rd Qu.:4.00        3rd Qu.:1.000   3rd Qu.:40.00   3rd Qu.:1.000  
##  Max.   :6.00        Max.   :2.000   Max.   :98.00   Max.   :2.000  
##  NA's   :303926                                      NA's   :3680   
##  AGR_ESCOLARIDAD  AGR_EST_CIV      AGR_GURPET    AGR_NACIONAL    AGR_TRABAJA   
##  Min.   :10.00   Min.   :1.00    Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:24.00   1st Qu.:2.00    1st Qu.:1.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median :29.00   Median :2.00    Median :1.00   Median :1.000   Median :1.000  
##  Mean   :30.33   Mean   :2.28    Mean   :1.92   Mean   :1.004   Mean   :1.203  
##  3rd Qu.:39.00   3rd Qu.:3.00    3rd Qu.:2.00   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :59.00   Max.   :5.00    Max.   :6.00   Max.   :2.000   Max.   :2.000  
##  NA's   :12841   NA's   :63726   NA's   :5363   NA's   :8525    NA's   :8017   
##     AGR_OCUP       AGR_DEDICA     AGRESORES_OTROS_TOTAL AGR_OTROS_HOM   
##  Min.   : 110    Min.   :1.00     Min.   : 0.0          Min.   :0.00    
##  1st Qu.:5414    1st Qu.:1.00     1st Qu.: 0.0          1st Qu.:0.00    
##  Median :6111    Median :1.00     Median : 0.0          Median :0.00    
##  Mean   :6876    Mean   :1.84     Mean   : 0.2          Mean   :0.07    
##  3rd Qu.:9111    3rd Qu.:3.00     3rd Qu.: 0.0          3rd Qu.:0.00    
##  Max.   :9998    Max.   :6.00     Max.   :15.0          Max.   :8.00    
##  NA's   :85227   NA's   :280879   NA's   :167456        NA's   :167452  
##  AGR_OTRAS_MUJ    AGR_OTROS_N_OS   AGR_OTRAS_N_AS   INST_DENUN_HECHO
##  Min.   :0.0      Min.   :0.00     Min.   :0.00     Min.   :1.000   
##  1st Qu.:0.0      1st Qu.:0.00     1st Qu.:0.00     1st Qu.:3.000   
##  Median :0.0      Median :0.00     Median :0.00     Median :4.000   
##  Mean   :0.1      Mean   :0.02     Mean   :0.01     Mean   :3.439   
##  3rd Qu.:0.0      3rd Qu.:0.00     3rd Qu.:0.00     3rd Qu.:4.000   
##  Max.   :8.0      Max.   :7.00     Max.   :6.00     Max.   :6.000   
##  NA's   :167451   NA's   :167451   NA's   :167451                   
##  ORGANISMO_JURISDICCIONAL   CONDUCENTE     LEY_APLICABLE     ARTICULOVIF1   
##  Min.   : 1.00            Min.   :1.00     Min.   :1.00     Min.   : 1.0    
##  1st Qu.: 1.00            1st Qu.:1.00     1st Qu.:1.00     1st Qu.: 7.0    
##  Median : 1.00            Median :1.00     Median :1.00     Median : 7.0    
##  Mean   : 4.69            Mean   :1.37     Mean   :1.75     Mean   : 6.7    
##  3rd Qu.: 7.00            3rd Qu.:2.00     3rd Qu.:3.00     3rd Qu.: 7.0    
##  Max.   :16.00            Max.   :2.00     Max.   :6.00     Max.   :10.0    
##  NA's   :225602           NA's   :233203   NA's   :159784   NA's   :228391  
##   ARTICULOVIF2     ARTICULOVIF3     ARTICULOVIF4     ARTICULOVCM1   
##  Min.   : 0.00    Min.   : 0.00    Min.   : 0.00    Min.   : 0.00   
##  1st Qu.: 0.00    1st Qu.: 0.00    1st Qu.: 0.00    1st Qu.: 7.00   
##  Median : 0.00    Median : 0.00    Median : 0.00    Median : 7.00   
##  Mean   : 0.36    Mean   : 0.24    Mean   : 0.26    Mean   : 6.57   
##  3rd Qu.: 0.00    3rd Qu.: 0.00    3rd Qu.: 0.00    3rd Qu.: 7.00   
##  Max.   :15.00    Max.   :17.00    Max.   :16.00    Max.   :25.00   
##  NA's   :229166   NA's   :229184   NA's   :228480   NA's   :303468  
##   ARTICULOVCM2     ARTICULOVCM3     ARTICULOVCM4    ARTICULOCODPEN1 
##  Min.   : 0.00    Min.   : 0.00    Min.   : 0.00    Min.   :  4     
##  1st Qu.: 0.00    1st Qu.: 0.00    1st Qu.: 0.00    1st Qu.:203     
##  Median : 0.00    Median : 0.00    Median : 0.00    Median :215     
##  Mean   : 0.57    Mean   : 0.21    Mean   : 0.28    Mean   :322     
##  3rd Qu.: 0.00    3rd Qu.: 0.00    3rd Qu.: 0.00    3rd Qu.:483     
##  Max.   :25.00    Max.   :25.00    Max.   :25.00    Max.   :495     
##  NA's   :301080   NA's   :300434   NA's   :300297   NA's   :336796  
##  ARTICULOCODPEN2  ARTICULOCODPEN3  ARTICULOCODPEN4  ARTICULOTRAS1   
##  Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    Min.   :141.0   
##  1st Qu.:  0.0    1st Qu.:  0.0    1st Qu.:  0.0    1st Qu.:141.0   
##  Median :  0.0    Median :  0.0    Median :  0.0    Median :141.0   
##  Mean   : 29.8    Mean   :  1.6    Mean   :  0.6    Mean   :147.4   
##  3rd Qu.:  0.0    3rd Qu.:  0.0    3rd Qu.:  0.0    3rd Qu.:141.0   
##  Max.   :494.0    Max.   :257.0    Max.   :205.0    Max.   :173.0   
##  NA's   :336796   NA's   :336796   NA's   :336796   NA's   :337519  
##  ARTICULOTRAS2    ARTICULOTRAS3    ARTICULOTRAS4    MEDIDAS_SEGURIDAD
##  Min.   :  0.0    Min.   :0        Min.   :0        Min.   :1        
##  1st Qu.:142.0    1st Qu.:0        1st Qu.:0        1st Qu.:1        
##  Median :142.0    Median :0        Median :0        Median :1        
##  Mean   :113.6    Mean   :0        Mean   :0        Mean   :1        
##  3rd Qu.:142.0    3rd Qu.:0        3rd Qu.:0        3rd Qu.:1        
##  Max.   :142.0    Max.   :0        Max.   :0        Max.   :2        
##  NA's   :337519   NA's   :337519   NA's   :337519   NA's   :160713   
##  TIPO_MEDIDA        ORGANISMO_REMITE  vicRatioAgr     diferenciaEdad    
##  Length:337524      Min.   : 1.00    Min.   :0.0200   Length:337524     
##  Class :character   1st Qu.:17.00    1st Qu.:0.8276   Class :character  
##  Mode  :character   Median :17.00    Median :0.9375   Mode  :character  
##                     Mean   :15.71    Mean   :1.0165                     
##                     3rd Qu.:18.00    3rd Qu.:1.0588                     
##                     Max.   :19.00    Max.   :9.0000                     
##                     NA's   :257336

Con esto podemos decir que el dataset tiene 77 variables y 337524 observaciones.

Con lo cual decidimos dividir el dataset en 2 grupos, uno de validación y otro para entrenamiento, el de validación tiene el 30% de los datos mientras que el de entrenamiento el 70%. Los grupos se ven así:

# Fixed seed so the partition is reproducible.
set.seed(69420)
# 70% of the rows go to training, sampled with respect to the response
# variable; the rows not selected form the validation set.
train_index <- createDataPartition(
  y = edad_agr_vic[["diferenciaEdad"]],
  p = 0.7,
  list = FALSE
)
train_data <- edad_agr_vic[train_index, ]
test_data <- edad_agr_vic[-train_index, ]

# Interactive bar chart of category counts in the training set.
(
  ggplot(train_data, aes(x = diferenciaEdad)) +
    geom_bar(fill = "skyblue") +
    labs(title = "Datos de Entrenamiento", x = "Categoria", y = "Cuenta")
) %>%
  ggplotly()
# Interactive bar chart of category counts in the validation set.
(
  ggplot(test_data, aes(x = diferenciaEdad)) +
    geom_bar(fill = "orange") +
    labs(title = "Datos de Validacion", x = "Categoria", y = "Cuenta")
) %>%
  ggplotly()

Como se puede ver, existe una alta desigualdad en los datos, puesto que la gran mayoría de los casos se dan cuando la víctima tiene una edad similar a la del agresor. Definitivamente esto será algo a tomar en cuenta durante el entrenamiento del modelo; una de las posibles optimizaciones a evaluar podría ser balancear los datos de entrenamiento para mejorar la precisión.

Uso del Algoritmo

Modelos

Discusión